Back to Table of Contents
# Enforcing minimum Python version.
# An explicit check is used instead of `assert`, which is silently stripped
# when Python runs with optimisations enabled (`python -O`).
import sys
if sys.version_info < (3, 8):
    raise RuntimeError("Python 3.8 or newer is required")
# Importing required libraries
import pandas as pd
import numpy as np
import string # to create a list of letters
import random
import math
import re # to use regular expressions
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score
# Import and configure matplotlib
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import MaxNLocator # to format x-axis in plots
mpl.rc('figure', dpi=300) # setting plot resolution
import seaborn as sns # for visualisations
from IPython.display import display, HTML # for displaying reports
import pickle # to handle saving and loading objects
# Checking the GPU configuration
print(tf.config.list_physical_devices('GPU'))
[]
In this project we will be conducting experiments with a lot of different models that require significant computation. It is a very iterative and time-consuming process, so it is worthwhile to do some housekeeping first and define a few helpers — functions and objects that will allow us:
# Setting developer mode:
# if the developer mode is on, we will skip some time-consuming steps (i.e. model training) and
# instead load previously saved results.
# developer_mode = True # skip training, load saved results
developer_mode = False # train the model, save the results to disc
# Setting the path to the `_models` folder.
# Backslashes are escaped explicitly ("\\"); writing "\_" relies on an invalid
# escape sequence surviving as a literal backslash, which raises a
# SyntaxWarning/DeprecationWarning on modern Python. The value is unchanged.
path_model = ".\\_models\\"
# Defining a function to convert object name to a string
def object_to_string(obj):
    """Return the first global variable name bound to *obj*.

    Lookup is by identity (``is``), not equality. Returns the sentinel
    string "Object not found" when no global references *obj*.
    """
    found = next((name for name, value in globals().items() if value is obj), None)
    return found if found is not None else "Object not found"
class ModelTracker:
    """Bookkeeping wrapper for a trained DNN model.

    Keeps the model name, description, architecture config, a parameter
    summary string, the training history, and validation/test results.
    Handles persisting the Keras model (HDF5 file) and the tracker itself
    (pickle file) under the module-level `path_model` folder.
    """

    def __init__(self, model_name, model_obj, model_desc, model_params, history, val_results, test_results):
        assert isinstance(model_obj, keras.models.Model), 'model_obj should be an instance of keras.models.Model'
        assert isinstance(model_params, str), 'model_params should be a string'
        assert isinstance(history, dict), 'history should be a dictionary'
        assert isinstance(val_results, list), 'val_results should be a list'
        assert isinstance(test_results, list), 'test_results should be a list'
        self.model_name = model_name
        self.model_desc = model_desc
        # The Keras model itself is stored separately in an .h5 file
        self.model_filepath = path_model + model_name + '.h5'
        # Only the architecture config is kept on the tracker, not the weights
        self.model_obj = model_obj.get_config()
        self.model_params = model_params
        self.history = history
        self.val_results = val_results
        self.test_results = test_results

    def save_model(self, model_obj):
        """Save the given Keras model to `self.model_filepath`."""
        model_obj.save(self.model_filepath)

    def load_model(self):
        """Load and return the Keras model from `self.model_filepath`."""
        return keras.models.load_model(self.model_filepath)

    def save(self, filepath):
        """Pickle this tracker object to `path_model + filepath`."""
        with open(path_model + filepath, 'wb') as file:
            pickle.dump(self, file)

    @staticmethod
    def load(model_name):
        """Unpickle and return the tracker previously saved for `model_name`."""
        with open(path_model + model_name + '.pkl', 'rb') as file:
            return pickle.load(file)
# Defining a function to save a model and its results
def save_model_results(model_obj, model_desc, model_params, history, val_results, test_results, model_name = None):
    """Persist a trained Keras model and its metadata via a ModelTracker.

    Parameters mirror ModelTracker.__init__; `model_name` defaults to the
    global variable name bound to `model_obj` (via object_to_string).
    Does nothing when the module-level `developer_mode` flag is on.
    """
    if developer_mode:  # truthiness instead of `== True`
        print("The developer mode is ON - not saving the model to the disc, as it has already been saved.")
        return
    # Resolving the model's name from the global namespace when not supplied
    if model_name is None:  # identity check instead of `== None`
        model_name = object_to_string(model_obj)
    # Creating a ModelTracker object
    model_tracker = ModelTracker(model_name, model_obj, model_desc, model_params, history, val_results, test_results)
    # Saving the Keras model to the disc
    model_tracker.save_model(model_obj)
    # Saving the rest of the tracker object to the disc
    model_tracker.save(f'{model_name}.pkl')
# Defining a function that loads results
def load_model_results(model_name):
    """Load and return the saved ModelTracker for `model_name`."""
    return ModelTracker.load(model_name)
# Defining a function that loads the model
def load_model(model_name):
    """Load the saved tracker for `model_name` and return its Keras model."""
    tracker = ModelTracker.load(model_name)
    return tracker.load_model()
Back to Table of Contents
The data for this project was sourced from the Kaggle website. The dataset is provided in two separate CSV files (training and testing subsets), which contain images of hands in poses corresponding to letters in American Sign Language. The files contain columns label, pixel1, pixel2, ..., pixel784. Each row represents a single 28x28 pixel image with grayscale values between 0-255.
Each training and test case represents a label (0-25) as a one-to-one map for each alphabetic letter A-Z (and no cases for 9=J or 25=Z because of gesture motions).
# Setting file paths.
# Backslashes are escaped explicitly ("\\"); "\_" and "\s" are invalid escape
# sequences that only work by accident (and warn on modern Python).
# The resulting path value is unchanged.
path = ".\\_data\\sign_language\\"
file_train = path + 'sign_mnist_train.csv'
file_test = path + 'sign_mnist_test.csv'
# Loading the files as Pandas dataframes
df_train = pd.read_csv(file_train)
df_test = pd.read_csv(file_test)
# Inspecting the first five rows of the training dataframe
df_train.head()
| label | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | pixel9 | ... | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 | pixel784 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 107 | 118 | 127 | 134 | 139 | 143 | 146 | 150 | 153 | ... | 207 | 207 | 207 | 207 | 206 | 206 | 206 | 204 | 203 | 202 |
| 1 | 6 | 155 | 157 | 156 | 156 | 156 | 157 | 156 | 158 | 158 | ... | 69 | 149 | 128 | 87 | 94 | 163 | 175 | 103 | 135 | 149 |
| 2 | 2 | 187 | 188 | 188 | 187 | 187 | 186 | 187 | 188 | 187 | ... | 202 | 201 | 200 | 199 | 198 | 199 | 198 | 195 | 194 | 195 |
| 3 | 2 | 211 | 211 | 212 | 212 | 211 | 210 | 211 | 210 | 210 | ... | 235 | 234 | 233 | 231 | 230 | 226 | 225 | 222 | 229 | 163 |
| 4 | 13 | 164 | 167 | 170 | 172 | 176 | 179 | 180 | 184 | 185 | ... | 92 | 105 | 105 | 108 | 133 | 163 | 157 | 163 | 164 | 179 |
5 rows × 785 columns
# Inspecting the last five rows of the test dataframe
df_test.tail()
| label | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | pixel9 | ... | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 | pixel784 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7167 | 1 | 135 | 119 | 108 | 102 | 105 | 99 | 61 | 103 | 121 | ... | 108 | 112 | 116 | 114 | 118 | 180 | 184 | 176 | 167 | 163 |
| 7168 | 12 | 157 | 159 | 161 | 164 | 166 | 166 | 171 | 174 | 175 | ... | 213 | 213 | 213 | 214 | 213 | 211 | 210 | 210 | 209 | 208 |
| 7169 | 2 | 190 | 191 | 190 | 191 | 190 | 190 | 192 | 192 | 191 | ... | 216 | 215 | 213 | 214 | 214 | 213 | 210 | 211 | 209 | 208 |
| 7170 | 4 | 201 | 205 | 208 | 209 | 214 | 216 | 218 | 223 | 226 | ... | 112 | 169 | 255 | 255 | 237 | 113 | 91 | 67 | 70 | 63 |
| 7171 | 2 | 173 | 174 | 173 | 174 | 173 | 173 | 175 | 175 | 174 | ... | 201 | 200 | 197 | 198 | 198 | 197 | 195 | 195 | 193 | 192 |
5 rows × 785 columns
# Defining a function that gets dataframe dimensions
def get_df_dimensions(df, description):
    """Print and return the (rows, columns) dimensions of *df*.

    `description` is only used in the printed message.
    """
    num_rows, num_cols = df.shape
    print(f'The {description} dataset contains {num_rows:,} rows and {num_cols:,} columns.')
    return (num_rows, num_cols)
# Checking the size (rows, columns) of the training dataset
num_rows_train, num_cols_train = get_df_dimensions(df_train, 'training')
The training dataset contains 27,455 rows and 785 columns.
# Checking the size (rows, columns) of the test dataset
num_rows_test, num_cols_test = get_df_dimensions(df_test, 'test')
The test dataset contains 7,172 rows and 785 columns.
# Defining a function to display dataset summary
def dataset_summary(list_of_datasets, list_of_dataset_labels):
    """Return a DataFrame with the row count and percentage share of each dataset.

    Parameters
    ----------
    list_of_datasets : list of objects with a `.shape` attribute (DataFrames or arrays)
    list_of_dataset_labels : list of str, one label per dataset

    Returns a DataFrame with columns 'Dataset', 'Count', 'Percent', plus a
    final 'Total' row. The caller's label list is NOT mutated (the previous
    implementation appended 'Total' to it in place, a side effect on the caller).
    """
    # Row count per dataset, with the grand total appended as the last entry
    count = [dataset.shape[0] for dataset in list_of_datasets]
    count_total = sum(count)
    count.append(count_total)
    # Percentage of the grand total for each row, formatted as a string
    percent = [f'{current_count / count_total * 100:.2f}%' for current_count in count]
    df_rows = pd.DataFrame({
        'Dataset': list(list_of_dataset_labels) + ['Total'],
        'Count': count,
        'Percent': percent
    })
    return df_rows
# Summarising row counts of the train/test datasets and their shares of the total
dataset_summary([df_train, df_test], ['Training','Test'])
| Dataset | Count | Percent | |
|---|---|---|---|
| 0 | Training | 27455 | 79.29% |
| 1 | Test | 7172 | 20.71% |
| 2 | Total | 34627 | 100.00% |
# Creating a list of uppercase letters.
# list() over an iterable is the idiomatic form of the identity comprehension
# [x for x in xs].
letters = list(string.ascii_uppercase)
# Creating a DataFrame of letters, one row per letter A-Z
df_letters = pd.DataFrame(letters, columns = ['Letter'])
# Naming the index 'Label' so it can be merged with the label statistics later
df_letters = df_letters.rename_axis('Label')
# Defining a function to get statistics for each label
def get_label_statistics(df, description):
    """Compute per-label pixel statistics for a sign-language dataframe.

    Parameters
    ----------
    df : DataFrame with a 'label' column and 'pixel*' columns
    description : str used in the output column names (title-cased)

    Returns a DataFrame indexed by 'Label' with count, frequency and
    averaged min/max/mean/std of the pixel columns for each label.
    """
    # Extracting label names and sorting them in ascending order
    labels = sorted(df['label'].unique())
    # The pixel columns are the same for every label, so compute them once
    # (previously recomputed on every loop iteration)
    pixel_columns = [col for col in df.columns if 'pixel' in col]
    title = description.title()
    # Initialising a dictionary to store the statistics
    stats = {}
    # Iterating over labels and calculating statistics
    for label in labels:
        label_data = df[df['label'] == label]  # filtering rows by label
        count = len(label_data)
        freq = count / len(df)
        # Per-column statistic, then averaged across all pixel columns
        min_vals = label_data[pixel_columns].min().mean()
        max_vals = label_data[pixel_columns].max().mean()
        mean_vals = label_data[pixel_columns].mean().mean()
        std_dev = label_data[pixel_columns].std().mean()
        # Storing the statistics in the dictionary
        stats[label] = {
            f'Count ({title})': count,
            f'Frequency ({title})': f'{freq*100:.2f}%',
            f'Avg Min ({title})': f'{min_vals:.2f}',
            f'Avg Max ({title})': f'{max_vals:.2f}',
            f'Mean ({title})': f'{mean_vals:.2f}',
            f'St Dev ({title})': f'{std_dev:.2f}'
        }
    # Converting the dictionary to a DataFrame indexed by label
    df_stats = pd.DataFrame.from_dict(stats, orient='index')
    # Renaming the index
    df_stats = df_stats.rename_axis('Label')
    return df_stats
# Calculating per-label statistics for each dataset
df_labels_train = get_label_statistics(df_train, "training")
df_labels_test = get_label_statistics(df_test, "test")
# Merging the training and testing statistics on the 'Label' index
df_labels = df_labels_train.merge(df_labels_test, on='Label', how='outer')
# Merging the labels dataframe with the letters dataframe defined earlier
# (outer join keeps rows for the letters J and Z that have no samples)
df_labels = df_labels.merge(df_letters, on = 'Label', how = 'outer')
df_labels['Count (Total)'] = df_labels['Count (Training)'] + df_labels['Count (Test)']
df_labels['Frequency (Total)'] = df_labels['Count (Total)'] / df_labels['Count (Total)'].sum()
# Converting Count columns to integer type, filling the missing J/Z rows with 0
df_labels['Count (Training)'] = df_labels['Count (Training)'].fillna(0).astype(int)
df_labels['Count (Test)'] = df_labels['Count (Test)'].fillna(0).astype(int)
df_labels['Count (Total)'] = df_labels['Count (Total)'].fillna(0).astype(int)
# Setting option to display percentage format for float columns
pd.options.display.float_format = '{:.2%}'.format
# Reordering columns by position: Letter first, then counts/frequencies, then stats.
# NOTE(review): these positional indices depend on the exact merge output order -
# fragile if columns are added or the merges change; confirm before editing.
df_labels = df_labels.iloc[:, [12, 0, 1, 6, 7, 13, 14, 2, 3, 4, 5, 8, 9, 10, 11]]
# Sorting the dataframe by label index
df_labels = df_labels.sort_index(ascending = True)
df_labels
| Letter | Count (Training) | Frequency (Training) | Count (Test) | Frequency (Test) | Count (Total) | Frequency (Total) | Avg Min (Training) | Avg Max (Training) | Mean (Training) | St Dev (Training) | Avg Min (Test) | Avg Max (Test) | Mean (Test) | St Dev (Test) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Label | |||||||||||||||
| 0 | A | 1126 | 4.10% | 331 | 4.62% | 1457 | 4.21% | 31.04 | 252.07 | 160.60 | 40.52 | 73.73 | 246.84 | 168.10 | 36.16 |
| 1 | B | 1010 | 3.68% | 432 | 6.02% | 1442 | 4.16% | 35.05 | 251.15 | 162.18 | 39.15 | 46.44 | 248.42 | 161.72 | 40.75 |
| 2 | C | 1144 | 4.17% | 310 | 4.32% | 1454 | 4.20% | 40.55 | 247.32 | 156.04 | 39.47 | 50.95 | 244.96 | 157.24 | 38.83 |
| 3 | D | 1196 | 4.36% | 245 | 3.42% | 1441 | 4.16% | 40.15 | 250.06 | 161.08 | 39.14 | 53.41 | 241.06 | 156.73 | 39.69 |
| 4 | E | 957 | 3.49% | 498 | 6.94% | 1455 | 4.20% | 52.47 | 253.33 | 164.40 | 39.88 | 58.50 | 250.46 | 161.61 | 37.28 |
| 5 | F | 1204 | 4.39% | 247 | 3.44% | 1451 | 4.19% | 28.73 | 253.26 | 157.88 | 41.74 | 59.37 | 247.66 | 163.09 | 37.83 |
| 6 | G | 1090 | 3.97% | 348 | 4.85% | 1438 | 4.15% | 45.52 | 253.99 | 160.46 | 41.56 | 59.98 | 253.16 | 164.66 | 42.20 |
| 7 | H | 1013 | 3.69% | 436 | 6.08% | 1449 | 4.18% | 37.14 | 254.01 | 159.11 | 44.12 | 42.70 | 253.43 | 159.17 | 46.39 |
| 8 | I | 1162 | 4.23% | 288 | 4.02% | 1450 | 4.19% | 40.40 | 252.87 | 168.19 | 38.48 | 58.34 | 252.72 | 165.63 | 42.09 |
| 9 | J | 0 | NaN | 0 | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 10 | K | 1114 | 4.06% | 331 | 4.62% | 1445 | 4.17% | 27.20 | 249.44 | 149.82 | 43.68 | 41.05 | 243.11 | 149.35 | 41.40 |
| 11 | L | 1241 | 4.52% | 209 | 2.91% | 1450 | 4.19% | 36.73 | 252.18 | 162.42 | 39.42 | 66.94 | 243.24 | 159.87 | 39.38 |
| 12 | M | 1055 | 3.84% | 394 | 5.49% | 1449 | 4.18% | 42.77 | 254.75 | 162.06 | 40.63 | 50.32 | 249.78 | 162.32 | 42.17 |
| 13 | N | 1151 | 4.19% | 291 | 4.06% | 1442 | 4.16% | 38.89 | 254.44 | 163.02 | 41.89 | 62.43 | 253.76 | 168.82 | 40.39 |
| 14 | O | 1196 | 4.36% | 246 | 3.43% | 1442 | 4.16% | 34.36 | 253.74 | 156.90 | 41.70 | 60.78 | 249.29 | 156.78 | 41.53 |
| 15 | P | 1088 | 3.96% | 347 | 4.84% | 1435 | 4.14% | 10.92 | 247.48 | 130.74 | 44.48 | 27.93 | 232.40 | 128.19 | 39.63 |
| 16 | Q | 1279 | 4.66% | 164 | 2.29% | 1443 | 4.17% | 11.63 | 251.60 | 134.38 | 50.12 | 41.00 | 239.35 | 139.37 | 48.11 |
| 17 | R | 1294 | 4.71% | 144 | 2.01% | 1438 | 4.15% | 40.32 | 252.23 | 164.63 | 37.94 | 93.16 | 244.00 | 169.50 | 32.60 |
| 18 | S | 1199 | 4.37% | 246 | 3.43% | 1445 | 4.17% | 37.86 | 254.62 | 165.35 | 40.95 | 50.05 | 247.54 | 160.49 | 41.98 |
| 19 | T | 1186 | 4.32% | 248 | 3.46% | 1434 | 4.14% | 45.91 | 253.88 | 169.60 | 38.04 | 59.26 | 248.11 | 166.86 | 37.82 |
| 20 | U | 1161 | 4.23% | 266 | 3.71% | 1427 | 4.12% | 28.02 | 250.01 | 161.36 | 40.23 | 67.70 | 248.47 | 171.97 | 37.70 |
| 21 | V | 1082 | 3.94% | 346 | 4.82% | 1428 | 4.12% | 38.50 | 253.56 | 167.31 | 39.38 | 48.50 | 249.19 | 165.56 | 39.17 |
| 22 | W | 1225 | 4.46% | 206 | 2.87% | 1431 | 4.13% | 20.47 | 253.04 | 159.55 | 43.64 | 65.54 | 246.64 | 163.06 | 38.69 |
| 23 | X | 1164 | 4.24% | 267 | 3.72% | 1431 | 4.13% | 28.97 | 254.21 | 161.40 | 44.35 | 50.93 | 252.99 | 167.00 | 44.25 |
| 24 | Y | 1118 | 4.07% | 332 | 4.63% | 1450 | 4.19% | 32.68 | 253.88 | 166.05 | 41.14 | 56.29 | 251.37 | 167.15 | 40.97 |
| 25 | Z | 0 | NaN | 0 | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Resetting the display option back to the default
pd.reset_option('display.float_format')
# Building a bar chart to display label count across training and test datasets for each class
ax = df_labels[['Count (Test)', 'Count (Training)']].plot(kind="barh", color = ['red', 'blue'], figsize = (8,6))
# Setting the title
plt.title('Count of Labels in Training and Test Datasets')
# Setting the Axes labels
plt.xlabel('Count')
plt.ylabel('Letter')
# Setting the y-tick labels using values from the 'Letter' column
ax.set_yticklabels(df_labels['Letter'])
# Reversing the y-axis so the letters read A-Z from top to bottom
ax.invert_yaxis()
# Adding the grid
plt.grid(True)
# Moving the legend below the plot area
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3)
# Displaying the graph
plt.show()
We can see that in both training and testing datasets the classes are not represented evenly. Letters J and Z are absent, as they require gesture motions.
# Creating dataframes with features (X) and labels (y)
y_train = df_train['label']
X_train = df_train.drop(columns = 'label')
y_test_full = df_test['label']
X_test_full = df_test.drop(columns = 'label')
# Getting the image dimensions assuming a square image (784 pixels -> 28x28)
XSIZE = int(math.sqrt(X_train.shape[1]))
YSIZE = XSIZE
# Getting unique labels in the training dataset
unique_labels = np.unique(y_train)
print (f'Unique labels: {unique_labels}')
# Counting the number of classes in the training dataset
unique_labels_count = len(unique_labels)
print(f'Number of classes: {unique_labels_count}')
Unique labels: [ 0 1 2 3 4 5 6 7 8 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] Number of classes: 24
# Defining a function that will display a random sample of each class
def display_random_samples():
    """Plot one randomly selected training example for each class in a grid.

    Relies on module-level globals: df_labels, unique_labels, y_train,
    X_train, unique_labels_count, XSIZE and YSIZE.
    """
    # Creating an empty list to store examples of each class
    examples = []
    # Smallest class size among the classes that actually occur in training
    min_count = df_labels[df_labels['Count (Training)'] != 0]['Count (Training)'].min()
    # randrange's upper bound is exclusive, so n is always a valid index.
    # The previous randint(0, min_count) call is inclusive and could pick
    # n == min_count, raising IndexError for the smallest class.
    n = random.randrange(min_count)
    # Iterating over the unique labels and finding the n-th example of each class
    for label in unique_labels:
        index = np.where(y_train == label)[0][n]
        # Using XSIZE/YSIZE instead of hard-coded 28 for consistency
        image = X_train.iloc[index].to_numpy().reshape(XSIZE, YSIZE)
        description = df_labels.loc[label, 'Letter']
        examples.append((image, label, description))
    # Displaying the examples in a 6-column grid
    num_cols = 6
    num_rows = math.ceil(unique_labels_count / num_cols)
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(12, 6))
    axs = axs.ravel()
    for i, (image, label, description) in enumerate(examples):
        axs[i].imshow(image, cmap='gray')
        axs[i].set_title(f'Label: {label} ({description})')
        axs[i].axis('off')
    plt.tight_layout()
    plt.show()
display_random_samples()
Back to Table of Contents
# Looking at the range of the pixel intensities in the raw input data (X_train)
print((np.min(X_train),np.max(X_train)))
(0, 255)
The input values range from 0 to 255. We will need to scale them back to [0, 1] range, which in the context of Deep Neural Networks is considered beneficial for a number of reasons (promotes stable training, efficient optimisation, and improved generalisation capabilities).
# Initiating MinMaxScaler object
sca = MinMaxScaler()
# Fitting the scaler on the training data and transforming it
X_train_scaled = sca.fit_transform(X_train)
# Transforming the test data with the scaler fitted on the TRAINING data.
# Re-fitting on the test set (fit_transform) would leak test-set statistics
# and scale the two datasets with inconsistent per-feature min/max values.
X_test_full_scaled = sca.transform(X_test_full)
# Checking the results
print((np.min(X_train_scaled),np.max(X_train_scaled)))
(0.0, 1.0)
Back to Table of Contents
As we have observed previously, labels 9 (representing J) and 25 (representing Z) are missing from the dataset, as both J and Z require motion gestures in American Sign Language.
Since our labels represent categorical data, we will apply one-hot encoding - a process that turns categorical data into a format that works better with machine learning algorithms. As a result, each label will be represented in a binary-vector format.
# Performing one-hot encoding on the labels.
# Labels run from 0 to 24 (9=J is absent but its column is still allocated),
# so the encoding width is unique_labels_count + 1 = 25 columns.
y_train_encoded = keras.utils.to_categorical(y_train, unique_labels_count + 1)
y_test_full_encoded = keras.utils.to_categorical(y_test_full, unique_labels_count + 1)
# Checking the result
print(y_train_encoded)
[[0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.] [0. 0. 1. ... 0. 0. 0.] ... [0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 1. 0.]]
# Confirming the encoded shape: one row per sample, 25 one-hot columns
y_train_encoded.shape
(27455, 25)
Back to Table of Contents
# Splitting the original test dataset further into test and validation sets using a 50/50 split.
# We keep all of the training dataset for training.
X_test_scaled, X_valid_scaled, y_test, y_valid = train_test_split(X_test_full_scaled,
                                                                  y_test_full,
                                                                  train_size = 0.5,
                                                                  random_state = 42,
                                                                  stratify = y_test_full # stratify on the target
                                                                  )
# One-hot encoding the labels after splitting; the width is
# unique_labels_count + 1 = 25 so the columns line up with the training
# encoding (the maximum label value is 24).
y_test_encoded = keras.utils.to_categorical(y_test, unique_labels_count + 1)
y_valid_encoded = keras.utils.to_categorical(y_valid, unique_labels_count + 1)
dataset_summary([X_train_scaled, X_valid_scaled, X_test_scaled], ['Training', 'Validation', 'Test'])
| Dataset | Count | Percent | |
|---|---|---|---|
| 0 | Training | 27455 | 79.29% |
| 1 | Validation | 3586 | 10.36% |
| 2 | Test | 3586 | 10.36% |
| 3 | Total | 34627 | 100.00% |
We need to reshape our input arrays from 2D to 3D using the image dimensions XSIZE and YSIZE we established earlier.
# Checking the shape before reshaping: 2-D (samples, 784 flattened pixels)
X_train_scaled.shape
(27455, 784)
# Reshaping inputs from flat vectors to 2-D images; -1 infers the sample count
X_train_scaled = X_train_scaled.reshape(-1, XSIZE, YSIZE)
X_valid_scaled = X_valid_scaled.reshape(-1, XSIZE, YSIZE)
X_test_scaled = X_test_scaled.reshape(-1, XSIZE, YSIZE)
# Checking the shape after reshaping: 3-D (samples, 28, 28)
X_train_scaled.shape
(27455, 28, 28)
Back to Table of Contents
We are splitting the test set up rather than the training set in this case due to the way the original dataset was constructed and provided.
The dataset was originally pre-split into a training and a test set, as it allows to have a standardised test set that everyone can use to evaluate and compare their models. This way, the performance of different models can be compared on exactly the same data, ensuring a fair comparison.
(Typically, we would split the provided test dataset into a validation set and a test set when our original data is already split into a training set and a test set, and we want a separate validation set for model tuning.)
An alternative approach would be to combine the provided training and test sets into one large dataset, and then split this dataset into training, validation, and test sets as needed.
Back to Table of Contents
# Setting the parameters for the first convolutional network
# Setting the input shape based on the dimensions of the images
input_shape = XSIZE, YSIZE, 1 # image width, image height, number of channels
# Setting the configuration of hidden layers - number of filters for each convolutional layer
hidden_layers_config = [32, 64, 128]
# Setting the kernel size
kernel_size = 3 # a single integer specifies the same value for all spatial dimensions of the kernel
# Setting activation function for hidden layers
activation_hidden = 'ReLU'
# Setting optimisation technique
optimisation_technique = "nadam"
# Setting activation function for the final layer
activation_out = "Softmax"
# Setting the output size (25 = number of one-hot columns in the encoded labels)
output_size = y_train_encoded.shape[1]
# Setting initial learning rate (decayed by the scheduler during training)
learning_rate = 0.01
# Setting the loss function
loss = 'CategoricalCrossentropy' # we have multiple classes and our labels are one-hot encoded
# Setting maximum number of epochs
epochs_max = 30
# Defining a function to build CNN models with different configurations
def build_cnn_model(hidden_layers_config = hidden_layers_config,
                    kernel_size = kernel_size,
                    l2_rate = 0, # setting the default L2 regularisation to 0
                    activation_hidden = activation_hidden,
                    activation_out = activation_out,
                    input_shape = input_shape,
                    output_size = output_size,
                    optimisation_technique = optimisation_technique,
                    loss = loss, # loss function
                    display_summary = True,
                    kernel_initializer = 'he_uniform'
                    ):
    """Build and compile a sequential CNN.

    Each entry of `hidden_layers_config` produces one
    Conv2D -> BatchNormalization -> Activation -> MaxPooling2D block; the
    network ends with Flatten + a Dense softmax output layer.

    Returns a (model, params) tuple where `params` is a human-readable
    string summarising the configuration.
    Raises ValueError for an unknown `optimisation_technique`.
    """
    # Initialising a sequential model
    model = keras.models.Sequential()
    # Creating blocks of hidden layers using the specified configuration
    for i, hidden_layer_size in enumerate(hidden_layers_config):
        # Parameters for the convolutional layer.
        # (Named conv_params: the original code reused the name `params` for
        # this dict AND for the returned summary string.)
        conv_params = {'filters': hidden_layer_size,
                       'kernel_size': kernel_size,
                       'kernel_regularizer': keras.regularizers.l2(l2_rate),
                       'strides': 1,
                       'padding': 'same',
                       'kernel_initializer': kernel_initializer
                       }
        # Only the first convolutional layer needs the input shape
        if i == 0:
            conv_params['input_shape'] = input_shape
        # Adding the convolutional layer with the specified parameters
        model.add(keras.layers.Conv2D(**conv_params))
        # Batch Normalisation before the activation (see discussion below)
        model.add(keras.layers.BatchNormalization())
        # Applying non-linearity using the specified activation function
        model.add(keras.layers.Activation(activation_hidden))
        # Adding a pooling layer
        model.add(keras.layers.MaxPooling2D(pool_size = 2))
    # Flattening the output
    model.add(keras.layers.Flatten())
    # Adding the final dense layer
    model.add(keras.layers.Dense(output_size, activation = activation_out))
    # Resolving the optimizer via a lookup table instead of an if/elif chain
    optimizer_factories = {
        "adam": keras.optimizers.Adam,
        "sgd": keras.optimizers.SGD,
        "rmsprop": keras.optimizers.RMSprop,
        "nadam": keras.optimizers.Nadam,
        "adamax": keras.optimizers.Adamax,
    }
    try:
        optimizer = optimizer_factories[optimisation_technique.lower()]()
    except KeyError:
        raise ValueError("Invalid optimizer provided") from None
    # Compiling the model, tracking accuracy alongside the loss
    model.compile(loss = loss,
                  optimizer = optimizer,
                  metrics = ["accuracy"])
    # Building the human-readable parameters string
    params = (f'Nodes: {hidden_layers_config}\n'
              f'L2 Weight: {l2_rate}\n'
              f'Activation (Hidden): {activation_hidden}\n'
              f'Activation (Out): {activation_out}')
    if display_summary:
        model.summary()
        print(params)
    return (model, params)
# Building the first CNN model and capturing its parameter summary string
model_cnn_1, params_cnn_1 = build_cnn_model()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d (Conv2D) (None, 28, 28, 32) 320
batch_normalization (BatchN (None, 28, 28, 32) 128
ormalization)
activation (Activation) (None, 28, 28, 32) 0
max_pooling2d (MaxPooling2D (None, 14, 14, 32) 0
)
conv2d_1 (Conv2D) (None, 14, 14, 64) 18496
batch_normalization_1 (Batc (None, 14, 14, 64) 256
hNormalization)
activation_1 (Activation) (None, 14, 14, 64) 0
max_pooling2d_1 (MaxPooling (None, 7, 7, 64) 0
2D)
conv2d_2 (Conv2D) (None, 7, 7, 128) 73856
batch_normalization_2 (Batc (None, 7, 7, 128) 512
hNormalization)
activation_2 (Activation) (None, 7, 7, 128) 0
max_pooling2d_2 (MaxPooling (None, 3, 3, 128) 0
2D)
flatten (Flatten) (None, 1152) 0
dense (Dense) (None, 25) 28825
=================================================================
Total params: 122,393
Trainable params: 121,945
Non-trainable params: 448
_________________________________________________________________
Nodes: [32, 64, 128]
L2 Weight: 0
Activation (Hidden): ReLU
Activation (Out): Softmax
# Saving a diagram of the model architecture.
# Backslashes escaped explicitly: "\_" and "\m" are invalid escape sequences
# that warn on modern Python; the path value is unchanged.
keras.utils.plot_model(model_cnn_1, ".\\_images\\model_cnn_1.png")
Back to Table of Contents
Batch Normalization (BatchNorm) is a technique used to increase the stability of a neural network. It normalises the input layer by adjusting and scaling the activations.
There are three options for placement of the BatchNorm layers:
Each option has its own pros and cons.
In a Convolutional Neural Network (CNN), the general practice is to place Batch Normalization (BatchNorm) layers after the Convolutional layers but before Activation Layer, which applies non-linearity. This has been demonstrated to aid in the network's learning and generalisation processes. This helps to deal with the issue of internal covariate shift (changes in the distribution of network activations due to the change in network parameters during training).
Back to Table of Contents
Before we train our model, let's define another function train_model() that will take a model object, as well as batch size and number of epochs as an input, will fit the model to the training and validation data, then evaluate it on the test data and return the model's history object and summary results on the test data.
We will set a default value for number of epochs to 30.
We will also include a callback for early stopping (early_stopping) as input and set a default value for it.
To monitor early stopping we will use validation loss as it is generally more sensitive to overfitting than accuracy.
The patience parameter is important as it helps to avoid stopping too early due to minor fluctuations or noise in the validation loss or accuracy. We will use a default value of 3 for patience.
We will set restore_best_weights = True to ensure that our model has the weights that resulted in the best performance on the validation set during training.
We selected Nadam optimiser previously in section 2.1. Building a CNN Using Keras.
To implement exponential learning rate schedule we will define an internal function that will calculate exponential decay for the learning rate.
def train_model(model, # the neural network model to be trained
                model_desc, # model's description
                params, # parameters defined when building the model
                learning_rate = learning_rate, # initial learning rate
                epochs = epochs_max, # number of epochs (default is the max number defined earlier)
                batch_size = 512, # number of samples propagated through the network per training iteration
                training_rerun = False, # a flag for re-running training
                early_stopping_cb = None, # optional EarlyStopping callback; a fresh default is built per call
                model_name = None # name of the model
                ):
    """Train *model*, evaluate it on the test set and persist the results.

    When `early_stopping_cb` is None a fresh EarlyStopping callback
    (monitor='val_loss', patience=3, restore_best_weights=True) is created
    for each call. The previous implementation used a single EarlyStopping
    instance as the parameter default, sharing one stateful callback object
    across every training run (mutable-default-argument bug).

    In developer mode, training is skipped and a message is printed.
    Returns None in both cases.
    """
    # Creating a fresh early-stopping callback per call (stateful, must not be shared)
    if early_stopping_cb is None:
        early_stopping_cb = EarlyStopping(monitor='val_loss',
                                          patience = 3,
                                          restore_best_weights = True)
    # Exponential learning-rate schedule: eta0 * 0.1 ** (epoch / s).
    # A closure replaces the original function-attribute bookkeeping, whose
    # hasattr branches were dead code (s and eta0 were always set before use).
    s = 5          # decay rate: the LR drops by a factor of 10 every s epochs
    eta0 = learning_rate  # initial learning rate
    def exp_decay(epoch, lr):
        return eta0 * (0.1 ** (epoch / s))
    # Creating a callback using the learning rate scheduler
    lr_scheduler_cb = tf.keras.callbacks.LearningRateScheduler(exp_decay)
    # Resolving the model name from the global namespace when not supplied
    if model_name is None:
        model_name = object_to_string(model)
    if developer_mode:
        print(f'The developer mode is ON.\n{model_name} has been trained and saved previously.\nSkipping training.')
        return
    # Clearing the Keras session unless we are re-running an existing training session
    if not training_rerun:
        keras.backend.clear_session()
    # Assigning learning-rate scheduler and early-stopping callbacks
    callbacks = [lr_scheduler_cb, early_stopping_cb]
    # Fitting the model
    history_obj = model.fit(X_train_scaled,
                            y_train_encoded,
                            epochs = epochs,
                            batch_size = batch_size,
                            validation_data = (X_valid_scaled, y_valid_encoded),
                            callbacks = callbacks,
                            verbose = 0)
    # Getting the history dictionary
    history = history_obj.history
    # The last points of the validation curves serve as validation loss/accuracy
    val_results = [history['val_loss'][-1], history['val_accuracy'][-1]]
    # Evaluating the model on the held-out test set
    test_results = model.evaluate(X_test_scaled, y_test_encoded, verbose = 0)
    # Saving the model and results
    save_model_results(model, model_desc, params, history, val_results, test_results, model_name)
    print(f'Model {model_name} has been trained successfully.')
# Training the CNN model built in Section 2 and saving its results
train_model(model = model_cnn_1, model_desc = 'CNN Model (Section 2)', params = params_cnn_1)
Model model_cnn_1 has been trained successfully.
Back to Table of Contents
First, we will define a function that takes the following inputs:
The function prepares the data and builds the graph, displaying the learning curves for loss and accuracy for training and testing data.
# Defining a function to plot learning curves
def plot_learning_curves(model_name):
    """Plot training/validation loss and accuracy curves for a saved model.

    Loads the stored history, description and parameter string for
    `model_name`, draws loss (dashed) on the left y-axis and accuracy
    (solid, formatted as a percentage) on a twinned right y-axis, and
    annotates the figure with the model's parameter string.
    """
    # Load the persisted results for this model
    saved = load_model_results(model_name)
    description = saved.model_desc
    hist = saved.history
    param_text = saved.model_params
    # First four colours of the 'tab10' palette
    palette = plt.get_cmap('tab10').colors[:4]
    # One figure with a shared x-axis (epochs) and two y-axes
    fig, ax_loss = plt.subplots(figsize = (12 , 8))
    # Loss curves (dashed) on the left axis
    loss_lines = ax_loss.plot(hist['loss'], label='Training Loss', linestyle = '--', color = palette[0])
    loss_lines = loss_lines + ax_loss.plot(hist['val_loss'], label='Validation Loss', linestyle = '--', color = palette[2])
    # Accuracy curves (solid) on a twinned right axis
    ax_acc = ax_loss.twinx()
    acc_lines = ax_acc.plot(hist['accuracy'], label='Training Accuracy', color = palette[1])
    acc_lines = acc_lines + ax_acc.plot(hist['val_accuracy'], label='Validation Accuracy', color = palette[3])
    # Render the accuracy axis as whole percentages
    ax_acc.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:.0%}'.format(x)))
    # Labels and title
    ax_loss.set_xlabel('Epochs')
    ax_loss.set_ylabel('Loss')
    ax_loss.set_title(f'Learning Curves: {description}')
    ax_acc.set_ylabel('Accuracy')
    # Integer epoch ticks; fixed axis limits so plots are comparable across models
    ax_loss.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax_loss.set_xlim(0, epochs_max)
    ax_acc.set_ylim(0, 1)
    # Separate legends for the loss and accuracy line groups, below the axes
    ax_loss.legend(loss_lines, [line.get_label() for line in loss_lines],
                   loc='upper center', bbox_to_anchor=(0.1, -0.1), ncol=1)
    ax_acc.legend(acc_lines, [line.get_label() for line in acc_lines],
                  loc='upper center', bbox_to_anchor=(0.9, -0.1), ncol=1)
    # Grid on the current axes, plus explicit vertical lines on the loss axis
    plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.5)
    ax_loss.grid(axis='x', linestyle='--', linewidth=0.5, color='gray', alpha=0.5)
    # Caption with the model's parameter string, below the plot
    plt.subplots_adjust(bottom=0.25) # make room for the caption
    plt.figtext(0.5, 0.05, f"Model Parameters\n{param_text}", ha="center", fontsize=10, bbox={"facecolor":"orange", "alpha":0.1, "pad":5})
    plt.show()
# Defining a custom function to display test results
def display_evaluation_results(model_name, mode = 1):
    """Display loss and accuracy results for a saved model.

    mode 1 shows test results, mode 2 shows validation results, and any
    other value shows validation results followed by test results.
    """
    # Number of decimal points used when rounding the reported numbers
    decimals = 3
    # Header
    display(HTML(f'<h4>Evaluation Results ({model_name})</h4>'))
    display(HTML("<hr>"))
    def show(results, label):
        # Loss rounded to `decimals`; accuracy shown as a percentage
        print(f' {label} Loss: {np.round(results[0], decimals = decimals)}\n {label} Accuracy: {np.round(results[1]*100, decimals = decimals - 2)}%')
        display(HTML("<hr>"))
    # Mode selection: 1 - Test, 2 - Validation, anything else - both
    if mode == 1:
        show(load_model_results(model_name).test_results, "Test")
    elif mode == 2:
        show(load_model_results(model_name).val_results, "Validation")
    else:
        show(load_model_results(model_name).val_results, "Validation")
        show(load_model_results(model_name).test_results, "Test")
# Plotting the learning curves for the model
plot_learning_curves('model_cnn_1')
# Displaying the held-out results (default mode 1 = test set)
display_evaluation_results('model_cnn_1')
Test Loss: 0.11 Test Accuracy: 96.2%
From the training curve we can observe that our CNN model initially produces very little decrease for the validation loss and takes several epochs for the validation accuracy to achieve levels higher than 80%. In other words, because of so many hidden layers, it takes a while for the model to learn (to adjust the weights and biases) in order to achieve reasonable performance.
# Getting predicted probabilities for each class
# (softmax output: one probability per class for every test sample)
y_pred_prob = model_cnn_1.predict(X_test_scaled)
113/113 [==============================] - 1s 7ms/step
# Assigning the label class with the highest probability to each sample
y_pred = np.argmax(y_pred_prob, axis=1)
# Creating a list of formatted class labels
# Filtering the letters on classes that exist in the dataset
# by looking up 'Letter' value from df_labels based on the index taken from the unique_labels list
unique_labels_description = [df_labels.loc[df_labels.index == item, 'Letter'].values[0] for item in unique_labels]
# Producing a list of class labels that include the target value and the letter, e.g. 'A (0)'
class_labels = [f'{desc} ({label})' for label, desc in zip(unique_labels, unique_labels_description)]
# Printing per-class precision/recall/F1 to measure the performance of the classifier
print(classification_report(y_test,
                            y_pred,
                            target_names = class_labels
                            ))
precision recall f1-score support
A (0) 1.00 1.00 1.00 166
B (1) 1.00 0.98 0.99 216
C (2) 1.00 1.00 1.00 155
D (3) 0.98 0.98 0.98 122
E (4) 0.93 1.00 0.96 249
F (5) 1.00 1.00 1.00 124
G (6) 1.00 0.97 0.98 174
H (7) 1.00 1.00 1.00 218
I (8) 0.94 0.92 0.93 144
K (10) 1.00 0.94 0.97 166
L (11) 0.97 1.00 0.99 104
M (12) 0.99 0.95 0.97 197
N (13) 1.00 0.90 0.95 145
O (14) 1.00 1.00 1.00 123
P (15) 0.98 1.00 0.99 173
Q (16) 0.98 1.00 0.99 82
R (17) 1.00 0.88 0.93 72
S (18) 0.98 0.93 0.96 123
T (19) 0.80 0.78 0.79 124
U (20) 1.00 0.98 0.99 133
V (21) 0.94 0.92 0.93 173
W (22) 0.84 1.00 0.92 103
X (23) 0.84 0.91 0.87 134
Y (24) 0.93 0.99 0.96 166
accuracy 0.96 3586
macro avg 0.96 0.96 0.96 3586
weighted avg 0.96 0.96 0.96 3586
# Defining a function to plot the Confusion Matrix
def plot_confusion_matrix(y_test, y_pred, model_name, return_matrix = False):
    """Plot a heatmap confusion matrix for a saved model's predictions.

    Parameters
    ----------
    y_test : array-like of true labels.
    y_pred : array-like of predicted labels.
    model_name : name of the saved model (used to look up its description).
    return_matrix : if True, also return the confusion matrix array.

    Returns
    -------
    numpy.ndarray confusion matrix when `return_matrix` is True, else None.
    """
    model = load_model_results(model_name)
    model_desc = model.model_desc
    # Reversed 'rocket' palette: darker cells = higher counts
    cmap = sns.color_palette('rocket_r', as_cmap=True)
    cm = confusion_matrix(y_test, y_pred)
    # NOTE(review): tick labels come from the module-level `class_labels`
    # variable, not a parameter - verify it matches y_test's class set.
    sns.heatmap(cm, square = True,
                annot=True, annot_kws={'size': 4},
                fmt='d', cbar=False, cmap=cmap,
                xticklabels = class_labels,
                yticklabels = class_labels,
                )
    plt.xlabel('Predicted Label', size = 8)
    plt.ylabel('True Label', size = 8)
    # Setting tick label size
    plt.xticks(fontsize = 5)
    plt.yticks(fontsize = 5)
    # Fixed title typo: 'Confustion' -> 'Confusion'
    plt.title(f'Confusion Matrix for {model_desc}')
    plt.show()
    if return_matrix:
        return cm
# Plotting the confusion matrix and test results for the baseline CNN
plot_confusion_matrix(y_test, y_pred, 'model_cnn_1')
display_evaluation_results('model_cnn_1')
Test Loss: 0.11 Test Accuracy: 96.2%
Our initial model performs well, but it took a long time during training to achieve high accuracy. Still, as we can see from the confusion matrix, there are a number of signs that the model does not always identify correctly. Looking at the samples below, we can see why - some signs look so similar in low resolution.
display_random_samples()
Back to Table of Contents
# Creating a list of L2 regularisation weights to sweep (powers of ten)
l2_weights = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
# Creating empty lists (one slot per weight) to store models, params, and validation results
l2_models = [None] * len(l2_weights)
l2_params = [None] * len(l2_weights)
l2_val_results = [None] * len(l2_weights)
# Building and training CNN models with different L2 weights
for i, l2 in enumerate(l2_weights):
    # Setting model description and model name
    model_desc = f'CNN Model (Section 3): L2 = {l2}'
    model_name = f'model_cnn_2_l2_{i}'
    # Displaying the header
    display(HTML(f'<h4>{model_desc}</h4>'))
    # Building the model with the specified L2 weight
    l2_models[i], l2_params[i] = build_cnn_model(l2_rate = l2, display_summary = False)
    # Training the model
    train_model(model = l2_models[i], model_name = model_name, model_desc = model_desc, params = l2_params[i])
    # Retrieving validation results for future use
    l2_val_results[i] = load_model_results(model_name).val_results
    # Plotting the learning curves for the current model
    plot_learning_curves(model_name)
    # Displaying the evaluation results (mode 2 = validation set)
    display_evaluation_results(model_name, 2)
Model model_cnn_2_l2_0 has been trained successfully.
Validation Loss: 0.146 Validation Accuracy: 95.4%
Model model_cnn_2_l2_1 has been trained successfully.
Validation Loss: 0.176 Validation Accuracy: 95.6%
Model model_cnn_2_l2_2 has been trained successfully.
Validation Loss: 0.196 Validation Accuracy: 96.5%
Model model_cnn_2_l2_3 has been trained successfully.
Validation Loss: 0.295 Validation Accuracy: 95.9%
Model model_cnn_2_l2_4 has been trained successfully.
Validation Loss: 0.137 Validation Accuracy: 96.5%
# Setting the criterion for model selection
criterium = 'accuracy' # or 'loss'
# Index into the [loss, accuracy] validation result pairs
if criterium == 'accuracy':
    criterium_index = 1
else:
    criterium_index = 0
# Trackers for the best model found so far.
# For 'loss', smaller is better, so the sentinel starts at +inf.
# (The original code always kept the HIGHEST value, which is wrong for 'loss'.)
best_value = 0 if criterium == 'accuracy' else float('inf')
best_model_index = 0
best_l2_weight = 0
# Selecting the best model based on the chosen criterion:
for i, l2 in enumerate(l2_weights):
    current = l2_val_results[i][criterium_index]
    # Higher accuracy is better; lower loss is better
    if criterium == 'accuracy':
        is_better = best_value < current
    else:
        is_better = current < best_value
    if is_better:
        best_value = current
        best_model_index = i
        best_model_l2_weight = l2
        best_model_l2_name = f'model_cnn_2_l2_{i}'
# Getting the best model's saved results
best_model_l2 = load_model_results(best_model_l2_name)
best_model_l2_test_results = best_model_l2.test_results
best_model_l2_val_results = best_model_l2.val_results
# Parsing the layer configuration (e.g. 'Nodes: [32, 64, 128]') out of the params string
pattern = r"Nodes:\s*\[([0-9,\s]+)\]"
match = re.search(pattern, best_model_l2.model_params)
if match:
    list_str = match.group(1)
    num_list = [int(num_str) for num_str in list_str.split(',')]
    best_model_l2_config = num_list
print(f'Based on {criterium}, the best model is {best_model_l2_name} with L2 = {best_model_l2_weight}.')
display_evaluation_results(best_model_l2_name, 2)
Based on accuracy, the best model is model_cnn_2_l2_4 with L2 = 0.1.
Validation Loss: 0.137 Validation Accuracy: 96.5%
From the learning curves we can observe that the L2 regularisation mostly affects the gap between the training and validation loss curves after enough epochs, indicating that the models have learned effectively.
All things being relatively equal (similar loss and accuracy), it's better to select the model with the minimal gap between loss curves.
Back to Table of Contents
# Creating a list of different layer configurations (filters per residual/conv stage)
layer_configs = [[16, 32, 64], [32, 64], [16, 32]]
# Creating empty lists (one slot per configuration) to store models, parameters and validation results
layer_config_models = [None] * len(layer_configs)
layer_config_params = [None] * len(layer_configs)
layer_config_val_results = [None] * len(layer_configs)
# Training CNN models with different layer configurations,
# each re-using the best L2 weight found in the previous section
for i, layer_config in enumerate(layer_configs):
    # Train the model and save the results in the list
    model_desc = f'CNN Model (Section 3): Layer Configuration: {layer_config}'
    model_name = f'model_cnn_2_layer_config_{i}'
    display(HTML(f'<h4>{model_desc}</h4>'))
    layer_config_models[i], layer_config_params[i] = build_cnn_model(hidden_layers_config = layer_config,
                                                                     l2_rate = best_model_l2_weight,
                                                                     display_summary = False)
    train_model(model = layer_config_models[i], model_desc = model_desc, params = layer_config_params[i], model_name = model_name)
    layer_config_val_results[i] = load_model_results(model_name).val_results
    # Plotting the learning curves for the current model
    plot_learning_curves(model_name)
    # Displaying the evaluation results (mode 2 = validation set)
    display_evaluation_results(model_name, 2)
Model model_cnn_2_layer_config_0 has been trained successfully.
Validation Loss: 0.18 Validation Accuracy: 96.1%
Model model_cnn_2_layer_config_1 has been trained successfully.
Validation Loss: 0.283 Validation Accuracy: 92.4%
Model model_cnn_2_layer_config_2 has been trained successfully.
Validation Loss: 0.316 Validation Accuracy: 91.7%
# Adding the best model from #3.1. to the list of layer configurations for later comparison
layer_configs.append(best_model_l2_config)
layer_config_val_results.append(best_model_l2_val_results)
# Creating variables to store best values
best_value = 0
best_model_index = 0
best_layer_config = []
best_val_results = []
# Selecting the best model based on accuracy:
# NOTE(review): this keeps the HIGHEST value, which is only correct while
# `criterium` is 'accuracy' (criterium_index == 1); for 'loss' the
# comparison would need to be reversed - confirm before switching criteria.
for i, layer_config in enumerate(layer_configs):
    if best_value < layer_config_val_results[i][criterium_index]:
        best_value = layer_config_val_results[i][criterium_index]
        best_model_layer_config_index = i
        best_layer_config = layer_config
        # Checking if the current model is the one from the previous section (L2 Regularisation),
        # which was appended last and keeps its original saved name
        if i == len(layer_configs) - 1:
            best_model_layer_config_name = best_model_l2_name
        else:
            best_model_layer_config_name = f'model_cnn_2_layer_config_{i}'
        best_val_results = layer_config_val_results[i]
print(f'Based on {criterium}, the best model is {best_model_layer_config_name} \
with {best_layer_config} layer configuration and L2 = {best_model_l2_weight}.')
Based on accuracy, the best model is model_cnn_2_l2_4 with [32, 64, 128] layer configuration and L2 = 0.1.
display_evaluation_results(best_model_layer_config_name, 3)
Validation Loss: 0.137 Validation Accuracy: 96.5%
Test Loss: 0.139 Test Accuracy: 96.5%
Back to Table of Contents
# Unpacking loss and accuracy values from the validation results
l2_losses, l2_accuracies = zip(*l2_val_results) # splitting the pairs into separate tuples
# Ensuring the lengths of l2_weights, losses, and accuracies are the same
assert len(l2_weights) == len(l2_losses) == len(l2_accuracies), 'The lists/tuples are of different lengths.'
# Plotting the loss and accuracy values vs the different L2 weights
# as grouped bars: loss (left axis) and accuracy (right axis)
fig, ax_loss = plt.subplots(figsize = (8,4))
# Creating an array with the position of each bar along the x-axis
x_pos = np.arange(len(l2_weights))
color = 'tab:red'
# Loss bars shifted left of each tick; accuracy bars will sit to the right
ax_loss.bar(x_pos - 0.2, l2_losses, 0.4, color = color, label = 'Loss')
ax_loss.tick_params(axis='y', labelcolor = color)
ax_loss.set_xlabel('L2 Weights')
ax_loss.set_ylabel('Loss', color=color)
ax_acc = ax_loss.twinx() # instantiate a second axes that shares the same x-axis
color = 'tab:blue'
ax_acc.set_ylabel('Accuracy (%)', color=color) # we already handled the x-label with ax1
ax_acc.bar(x_pos + 0.2 , [accuracy * 100 for accuracy in l2_accuracies], 0.4, color=color, label = 'Accuracy')
ax_acc.tick_params(axis='y', labelcolor=color)
# Configuring x-axis
ax_loss.set_xticks(x_pos)
# Using a LaTeX formatter for the x-tick labels (the weights are exact powers of ten)
ax_loss.set_xticklabels([r'$10^{%d}$' % math.log10(i) for i in l2_weights], fontsize = '8')
# Converting y-axis to percentage
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
yticks = ticker.FormatStrFormatter(fmt)
ax_acc.yaxis.set_major_formatter(yticks)
fig.tight_layout() # otherwise the right y-label is slightly clipped
plt.title('L2 Weights vs. Validation Loss and Accuracy')
# Adding the grid
plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.5)
# Adding vertical grid lines
ax_loss.grid(axis='x', linestyle='--', linewidth=0.5, color='gray', alpha=0.5)
# Adding caption
plt.subplots_adjust(bottom=0.2) # adjusting the bottom upwards to make room for caption
plt.figtext(0.5, 0.02, "Figure 3.3.1: Hyper-Parameter Optimisation - L2 Regularisation",
            ha="center", fontsize=10, bbox={"facecolor":"orange", "alpha":0.1, "pad":5})
plt.show()
# Unpacking loss and accuracy values from the validation results
losses, accuracies = zip(*layer_config_val_results) # splitting the pairs into separate tuples
# Ensuring the lengths of layer_configs, losses, and accuracies are the same
assert len(layer_configs) == len(losses) == len(accuracies), 'The lists/tuples are of different lengths.'
# Converting the layer configurations to text labels
# "lambda x: 'Conv'+str(x)" takes an integer x and returns a string 'Conv' ending with the integer (converted to a string)
# "map" applies the lambda function to each integer in the sublist
# "join" concatenates these strings together, separating them with a comma and a space
# "[... for sublist in layer_configs]" - list comprehension that applies the transformation to each sublist in layer_configs
layer_configs_text = [', '.join(map(lambda x: 'Conv'+str(x), sublist)) for sublist in layer_configs]
# Plotting the loss and accuracy values vs the different layer configurations
# as grouped bars: loss (left axis) and accuracy (right axis)
fig, ax_loss = plt.subplots(figsize = (8,4))
# Creating an array with the position of each bar along the x-axis
x_pos = np.arange(len(layer_configs_text))
color = 'tab:red'
ax_loss.bar(x_pos - 0.2, losses, 0.4, color=color, label = 'Loss')
ax_loss.tick_params(axis='y', labelcolor=color)
# Fixed axis-label typo: 'Configuation' -> 'Configuration'
ax_loss.set_xlabel('Layers Configuration')
ax_loss.set_ylabel('Loss', color=color)
ax_acc = ax_loss.twinx() # instantiate a second axes that shares the same x-axis
color = 'tab:blue'
ax_acc.set_ylabel('Accuracy (%)', color=color)
ax_acc.bar(x_pos + 0.2, [accuracy * 100 for accuracy in accuracies], 0.4, color=color, label = 'Accuracy')
ax_acc.tick_params(axis='y', labelcolor=color)
# Configuring x-axis
ax_loss.set_xticks(x_pos)
ax_loss.set_xticklabels(layer_configs_text, fontsize = '8')
# Converting y axis to percentage
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
yticks = ticker.FormatStrFormatter(fmt)
ax_acc.yaxis.set_major_formatter(yticks)
# Setting the layout
fig.tight_layout()
# Setting the title (fixed typo: 'Configuations' -> 'Configurations')
plt.title('Different Layer Configurations vs. Validation Loss and Accuracy')
# Showing the grid
plt.grid(True)
# Adding caption
plt.subplots_adjust(bottom=0.2) # adjusting the bottom upwards to make room for caption
plt.figtext(0.5, 0.02, "Figure 3.3.2: Hyper-Parameter Optimisation - Configuration of Hidden Layers",
            ha="center", fontsize=10, bbox={"facecolor":"orange", "alpha":0.1, "pad":5})
# Displaying the plot
plt.show()
Back to Table of Contents
To determine the best hyper-parameter values we select a model that achieves the highest accuracy (blue values on the graphs) and/or the lowest loss (red values on the graphs) based on validation set, not the test set. This avoids overfitting to the test set, which can lead to overly optimistic performance estimates.
The best validation accuracy is typically the highest accuracy achieved during the training process, and the best loss is the lowest loss achieved.
We can extract lists that contain the validation accuracy and validation loss at each epoch from the history object of a trained model.
However, while selecting the best model based on these metrics, it's important to also consider the state of the model at that point. Simply picking the highest validation accuracy or lowest validation loss could result in a model that is overfitted to the training data. In many cases, using techniques like early stopping (where training is halted when the validation loss stops improving) can be beneficial in selecting the best model.
Since we implemented the early stopping technique for our model training, we can select the last value from the history['val_accuracy'] array as our validation accuracy and the last value from the history['val_loss'] array as our validation loss.
From the first plot (Figure 3.3.1) we can observe the effect of L2 regularisation on the models' performance - for some values of the L2 weight it significantly reduces the performance of our model.
From the second plot (Figure 3.3.2) we can see that:
# Summarising the winner of the hyper-parameter search (selected on the validation set)
print(f'Based on validation {criterium}, the best model is {best_model_layer_config_name} \
with {best_layer_config} layer configuration and L2 = {best_model_l2_weight}.')
Based on validation accuracy, the best model is model_cnn_2_l2_4 with [32, 64, 128] layer configuration and L2 = 0.1.
Back to Table of Contents
We will define a function that builds residual modules with two convolutional layers and a skip connection that adds the input to the output of the second layer, prior to the use of the activation function.
Our function will take the following arguments:
The kernel_size must be a positive integer, or a tuple of two positive integers (height and width of the convolution window).
# Defining a function that builds a residual module
def build_residual_module(input_tensor, num_filters, kernel_size, padding = 'same', activation = 'relu', kernel_initializer = 'he_uniform'):
    """Build a two-convolution residual block with a skip connection.

    The input tensor is added to the output of the second convolution
    (before its activation); when the channel counts differ, a 1x1
    convolution projects the shortcut first. Returns the activated
    output tensor.
    """
    # Main path: Conv -> BN -> activation -> Conv -> BN
    out = keras.layers.Conv2D(num_filters, kernel_size, padding = padding, kernel_initializer = kernel_initializer)(input_tensor)
    out = keras.layers.BatchNormalization()(out)
    out = keras.layers.Activation(activation)(out)
    out = keras.layers.Conv2D(num_filters, kernel_size, padding = padding, kernel_initializer = kernel_initializer)(out)
    out = keras.layers.BatchNormalization()(out)
    # Shortcut path: project with a 1x1 convolution when the filter counts differ
    shortcut = input_tensor
    if keras.backend.int_shape(shortcut)[-1] != num_filters:
        shortcut = keras.layers.Conv2D(num_filters, (1, 1), padding='same')(shortcut)
    # Merge the two paths, then apply the activation
    out = keras.layers.Add()([out, shortcut])
    return keras.layers.Activation(activation)(out)
Back to Table of Contents
# Defining a function to build a CNN with Residual Modules with different configurations
def build_resnet_model(hidden_layers_config = hidden_layers_config,
                       kernel_size = kernel_size,
                       l2_rate = 0, # setting the default L2 regularisation to 0
                       activation_hidden = activation_hidden,
                       activation_out = activation_out,
                       input_shape = input_shape,
                       output_size = output_size,
                       optimisation_technique = optimisation_technique,
                       loss = loss, # loss function
                       display_summary = True,
                       kernel_initializer = 'he_uniform'
                       ):
    """Build and compile a ResNet-style CNN using the Keras functional API.

    One residual module followed by 2x2 max-pooling is stacked per entry
    of `hidden_layers_config`, followed by global average pooling and a
    dense output layer.

    Returns
    -------
    (model, params): the compiled keras.Model and a human-readable
    parameter string used by the saving/plotting helpers.
    """
    # Functional-API input
    inputs = keras.Input(shape=input_shape)
    x = inputs
    # Stacking one residual module + max-pool per configured filter count.
    # NOTE(review): `l2_rate` and `activation_hidden` are recorded in the
    # params string below but are NOT applied to the layers here -
    # build_residual_module is called with its own defaults. Confirm
    # whether they should be passed through.
    for num_filters in hidden_layers_config:
        x = build_residual_module(x, num_filters, kernel_size)
        x = keras.layers.MaxPooling2D(pool_size = 2)(x)
    # Global average pooling instead of flattening
    x = keras.layers.GlobalAvgPool2D()(x)
    # Final dense classification layer
    outputs = keras.layers.Dense(output_size, activation = activation_out, kernel_initializer = kernel_initializer)(x)
    model = keras.Model(inputs, outputs)
    # Optimizer dispatch table (replaces the previous if/elif chain)
    optimizer_classes = {
        "adam": keras.optimizers.Adam,
        "sgd": keras.optimizers.SGD,
        "rmsprop": keras.optimizers.RMSprop,
        "nadam": keras.optimizers.Nadam,
        "adamax": keras.optimizers.Adamax,
    }
    optimizer_key = optimisation_technique.lower()
    if optimizer_key not in optimizer_classes:
        raise ValueError("Invalid optimizer provided")
    optimizer = optimizer_classes[optimizer_key]()
    # Compiling the model
    model.compile(loss = loss,
                  optimizer = optimizer,
                  metrics = ["accuracy"]) # Including the additional metric of 'accuracy'
    # Building the parameters string (the 'Nodes: [...]' line is parsed by
    # a regex elsewhere, so its format must be preserved)
    params = (f'Nodes: {hidden_layers_config}\n'
              f'L2 Weight: {l2_rate}\n'
              f'Activation (Hidden): {activation_hidden}\n'
              f'Activation (Out): {activation_out}')
    if display_summary:
        model.summary()
        print(params)
    return (model, params)
model_resnet, params_resnet = build_resnet_model(hidden_layers_config = best_layer_config)
Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) [(None, 28, 28, 1)] 0 []
conv2d (Conv2D) (None, 28, 28, 32) 320 ['input_1[0][0]']
batch_normalization (BatchNorm (None, 28, 28, 32) 128 ['conv2d[0][0]']
alization)
activation (Activation) (None, 28, 28, 32) 0 ['batch_normalization[0][0]']
conv2d_1 (Conv2D) (None, 28, 28, 32) 9248 ['activation[0][0]']
batch_normalization_1 (BatchNo (None, 28, 28, 32) 128 ['conv2d_1[0][0]']
rmalization)
conv2d_2 (Conv2D) (None, 28, 28, 32) 64 ['input_1[0][0]']
add (Add) (None, 28, 28, 32) 0 ['batch_normalization_1[0][0]',
'conv2d_2[0][0]']
activation_1 (Activation) (None, 28, 28, 32) 0 ['add[0][0]']
max_pooling2d (MaxPooling2D) (None, 14, 14, 32) 0 ['activation_1[0][0]']
conv2d_3 (Conv2D) (None, 14, 14, 64) 18496 ['max_pooling2d[0][0]']
batch_normalization_2 (BatchNo (None, 14, 14, 64) 256 ['conv2d_3[0][0]']
rmalization)
activation_2 (Activation) (None, 14, 14, 64) 0 ['batch_normalization_2[0][0]']
conv2d_4 (Conv2D) (None, 14, 14, 64) 36928 ['activation_2[0][0]']
batch_normalization_3 (BatchNo (None, 14, 14, 64) 256 ['conv2d_4[0][0]']
rmalization)
conv2d_5 (Conv2D) (None, 14, 14, 64) 2112 ['max_pooling2d[0][0]']
add_1 (Add) (None, 14, 14, 64) 0 ['batch_normalization_3[0][0]',
'conv2d_5[0][0]']
activation_3 (Activation) (None, 14, 14, 64) 0 ['add_1[0][0]']
max_pooling2d_1 (MaxPooling2D) (None, 7, 7, 64) 0 ['activation_3[0][0]']
conv2d_6 (Conv2D) (None, 7, 7, 128) 73856 ['max_pooling2d_1[0][0]']
batch_normalization_4 (BatchNo (None, 7, 7, 128) 512 ['conv2d_6[0][0]']
rmalization)
activation_4 (Activation) (None, 7, 7, 128) 0 ['batch_normalization_4[0][0]']
conv2d_7 (Conv2D) (None, 7, 7, 128) 147584 ['activation_4[0][0]']
batch_normalization_5 (BatchNo (None, 7, 7, 128) 512 ['conv2d_7[0][0]']
rmalization)
conv2d_8 (Conv2D) (None, 7, 7, 128) 8320 ['max_pooling2d_1[0][0]']
add_2 (Add) (None, 7, 7, 128) 0 ['batch_normalization_5[0][0]',
'conv2d_8[0][0]']
activation_5 (Activation) (None, 7, 7, 128) 0 ['add_2[0][0]']
max_pooling2d_2 (MaxPooling2D) (None, 3, 3, 128) 0 ['activation_5[0][0]']
global_average_pooling2d (Glob (None, 128) 0 ['max_pooling2d_2[0][0]']
alAveragePooling2D)
dense (Dense) (None, 25) 3225 ['global_average_pooling2d[0][0]'
]
==================================================================================================
Total params: 301,945
Trainable params: 301,049
Non-trainable params: 896
__________________________________________________________________________________________________
Nodes: [32, 64, 128]
L2 Weight: 0
Activation (Hidden): ReLU
Activation (Out): Softmax
keras.utils.plot_model(model_resnet, ".\_images\model_resnet.png")
Back to Table of Contents
# Training the ResNet model and retrieving history and test results
# (early stopping on validation loss; best weights restored after training)
train_model(model = model_resnet,
            model_desc = 'ResNet Model (Section 4)',
            params = params_resnet,
            early_stopping_cb = EarlyStopping(monitor='val_loss',
                                              patience = 5,
                                              restore_best_weights = True
                                              )
            )
Model model_resnet has been trained successfully.
Back to Table of Contents
# Plotting the learning curves for the model
plot_learning_curves('model_resnet')
display_evaluation_results('model_resnet', 3)
Validation Loss: 0.031 Validation Accuracy: 99.3%
Test Loss: 0.031 Test Accuracy: 99.4%
Back to Table of Contents
From the learning curves we can observe that the ResNet model performs extremely well even without regularisation:
ResNets feature a skip connection (shortcut) that allows the gradient to be directly propagated to earlier layers (unlike in traditional CNN architecture). Because of that the ResNet model has a number of advantages that we can observe here:
Back to Table of Contents
We previously selected the best CNN network (see 3.4. What Are the Best Hyper-Parameter Values to Use?) from Section 2 and 3.
print(f'Based on validation {criterium}, the best model (from Part 2 and 3) is \
{best_model_layer_config_name} with {best_layer_config} and L2 = {best_model_l2_weight}.')
Based on validation accuracy, the best model (from Part 2 and 3) is model_cnn_2_l2_4 with [32, 64, 128] and L2 = 0.1.
# Retrieving model history and test restuls
plot_learning_curves(best_model_layer_config_name)
display_evaluation_results(best_model_layer_config_name, 3)
Validation Loss: 0.137 Validation Accuracy: 96.5%
Test Loss: 0.139 Test Accuracy: 96.5%
# Plotting the learning curves for the model
plot_learning_curves('model_resnet')
display_evaluation_results('model_resnet', 3)
Validation Loss: 0.031 Validation Accuracy: 99.3%
Test Loss: 0.031 Test Accuracy: 99.4%
Choosing the best model involves considering several factors:
Validation and Test Accuracy: this gives you an idea of how well the model generalises to new, unseen data. The model with the highest test accuracy is typically considered the best. However, accuracy alone should not be the deciding factor, especially if the differences are very small or if the dataset is imbalanced.
Validation and Test Loss: The model with the lowest validation and test loss is generally the one that has learned to generalise better. However, again, if the differences are very small, this should not be the only deciding factor.
Learning Curves: By examining the learning curves, we can see whether the model has learned effectively. Ideally, the model should reach a point where the training and validation loss decrease to a point of stability with a minimal gap between the two final loss values.
Overfitting: If our model performs well on the training data but poorly on the validation data, it may be overfitting.
Underfitting: If it performs poorly on both training and validation data, it might be underfitting.
Model Complexity: More complex models can take longer to train and require more computational resources. If two models perform similarly, it might be better to choose the simpler one.
Considering these factors, we can select ResNet is the best network, because:
model_best = model_resnet
Back to Table of Contents
# Getting predicted probabilities for each class
y_pred_prob_best = model_best.predict(X_test_scaled)
113/113 [==============================] - 2s 13ms/step
# Assigning the label class with the highest probability to each sample
y_pred_best = np.argmax(y_pred_prob_best, axis=1)
# Printing per-class precision/recall/F1 to measure the performance of the classifier
print(classification_report(y_test,
                            y_pred_best,
                            target_names = class_labels
                            ))
precision recall f1-score support
A (0) 1.00 1.00 1.00 166
B (1) 1.00 1.00 1.00 216
C (2) 1.00 1.00 1.00 155
D (3) 1.00 1.00 1.00 122
E (4) 1.00 1.00 1.00 249
F (5) 1.00 1.00 1.00 124
G (6) 1.00 0.95 0.98 174
H (7) 1.00 1.00 1.00 218
I (8) 0.93 1.00 0.96 144
K (10) 1.00 1.00 1.00 166
L (11) 1.00 1.00 1.00 104
M (12) 1.00 1.00 1.00 197
N (13) 1.00 1.00 1.00 145
O (14) 1.00 1.00 1.00 123
P (15) 1.00 1.00 1.00 173
Q (16) 1.00 1.00 1.00 82
R (17) 1.00 0.99 0.99 72
S (18) 1.00 1.00 1.00 123
T (19) 0.94 0.99 0.96 124
U (20) 0.99 1.00 1.00 133
V (21) 1.00 1.00 1.00 173
W (22) 1.00 1.00 1.00 103
X (23) 0.99 1.00 1.00 134
Y (24) 1.00 0.93 0.97 166
accuracy 0.99 3586
macro avg 0.99 0.99 0.99 3586
weighted avg 0.99 0.99 0.99 3586
cm_best = plot_confusion_matrix(y_test, y_pred_best, 'model_resnet', return_matrix = True)
Back to Table of Contents
We can find the most often incorrectly classified class by finding a class with the highest proportion of incorrectly classified instances in comparison to the total number of instances for this class.
# Counting the number of instances for each unique class
unique_classes, true_class_count = np.unique(y_test, return_counts = True)
# Getting the number of correctly classified instances for each class
# from the diagonal of the confusion matrix
correctly_classified_count = np.diag(cm_best)
# Getting the proportion of incorrectly classified instances for each class
proportion_incorrectly_classified = (true_class_count - correctly_classified_count) / true_class_count
# Getting the index of the class with the highest proportion of incorrectly classified instances
most_misclassified_class_index = np.argmax(proportion_incorrectly_classified)
# Fixed typo in the printed message: 'classfied' -> 'classified'
print(f'The most often incorrectly classified class: {class_labels[most_misclassified_class_index]}. \
It has the highest proportion of incorrectly classified instances in comparison to the total number of instances for that class: \
{proportion_incorrectly_classified[most_misclassified_class_index]*100:.2f}%.')
The most often incorrectly classified class: Y (24). It has the highest proportion of incorrectly classfied instances in comparison to the total number of instances for that class: 6.63%.
display_random_samples()
Back to Table of Contents
In this project we experimented with convolutional neural network (CNN) classifiers using an MNIST-like dataset based on American Sign Language. We built several different CNN networks, optimised hyper-parameters (performed L2 regularisation and optimised the number of layers and layer sizes), as well as implemented a ResNet version. We evaluated results obtained by these classifiers, compared their accuracy and loss scores on validation and test datasets and chose the best one.
The ResNet produced the best results (more than 99% accuracy); however, we only considered the 'straight-out-of-the-box' configuration. Additional hyper-parameter optimisation has the potential to improve its performance even further.
Back to Table of Contents